Based on the project proposed at https://blog.jovian.ai/whatsapp-message-exploratory-data-analysis-eda-538560ee1c99.
Question 1: What is the period in which the data was collected?
Question 2: Which users have the most Chat/messages in the group?
Question 3: Which emojis are used the most in the group?
Question 4: Which emoji is most used by each user?
Question 5: What are the top words used in the conversation?
Question 6: What are the top words used by each user?
Question 7: What is the most active time of day for messages in the group?
Question 8: What is the most active time of day for each user?
Question 9: What is the most active day of the week for messages in the group?
Question 10: What is the most active day of the week for each user in the group?
Question 11: What is the most active month for messages in the group?
Question 12: What is the most active month for each user in the group?
import plotly.express as px
import os
import pandas as pd
import re
import datetime as time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import emoji
import re
from collections import Counter
from wordcloud import WordCloud
import nltk
The data is split into two columns. Column 0 holds the date on which each message was sent. Column 1 holds the time, the sender and the message text. The output of the info function shows that column 1 has missing data.
# First look at the raw export: read_fwf loads the text file as fixed-width
# columns, and header=None means no line is used as column names.
data = pd.read_fwf('chat.txt', header= None)
# First rows of the raw frame (displayed only in an interactive session).
data.head()
# Column dtypes and non-null counts.
data.info()
The purpose of the following function is to load the data contained in the chat.txt file into a pandas dataframe. To do so, it creates a dataframe with three columns. The timestamp column holds the date and time of each message. The user column holds the sender of the message. Finally, the message column holds the content that was sent. After that, all messages containing media files such as photos and audio are removed. These messages differ from the others in that they contain the text <Media omitted>. There are 3855 messages of this type.
def txt_to_dataframe(txt_file):
    """Parse an exported chat text file into a DataFrame.

    Each entry is expected to start on a line that begins with a date such as
    ``12/25/20`` and to look like ``<date and time> - <sender>: <message>``.
    Lines that do not start with a date are treated as a continuation of the
    previous entry and are joined to it with a single space.

    Args:
        txt_file: Path of the exported chat file; it is read as UTF-8.

    Returns:
        A DataFrame with the columns ``timestamp`` (converted with
        ``pd.to_datetime``), ``user`` and ``message``. Entries for which no
        sender could be extracted are dropped and the index is reset.
    """
    # An entry runs from a line starting with a date up to (not including)
    # the next such line, or to the end of the text; this is what allows a
    # message to span more than one line.
    entry_pattern = re.compile(
        r'^(\d+/\d\d/\d\d.*?)(?=^\d+/\d\d/\d\d|\Z)', re.S | re.M)
    # Sender: the text between the first "- " and the next ":".
    sender_pattern = re.compile(r'- (.*?):')

    # Decode explicitly: without ``encoding`` open() uses the platform
    # default, which can garble or reject emoji and accented characters.
    with open(txt_file, encoding='utf-8') as file:
        entries = [m.group(1).strip().replace('\n', ' ')
                   for m in entry_pattern.finditer(file.read())]

    timestamp = []
    user = []
    message = []
    for row in entries:
        # Date and time: everything before the first " - " separator.
        timestamp.append(row.split(' - ')[0])

        # Entries without a "<sender>:" part get an empty user and are
        # filtered out below.
        sender_match = sender_pattern.search(row)
        user.append(sender_match.group(1) if sender_match else '')

        # Message content: everything after the first ": " (empty when the
        # separator is absent).
        message.append(row.partition(': ')[2])

    df = pd.DataFrame(
        {'timestamp': timestamp, 'user': user, 'message': message})
    df['timestamp'] = pd.to_datetime(df.timestamp)
    # remove events not associated with a sender
    df = df[df.user != ''].reset_index(drop=True)
    return df
# Load the chat into the (timestamp, user, message) frame.
df = txt_to_dataframe('chat.txt')
df.head()

# Media placeholders are the messages whose text contains "omitted".
is_media = df['message'].str.contains("omitted")
img = df.loc[is_media]
print("Amount of data with media content removed:", img.shape[0])

# Keep every other message and renumber the rows from 0.
df = df.loc[~is_media].reset_index(drop=True)

# Question 1: period covered by the data.
print("Start at",df['timestamp'].min())
print("End at",df['timestamp'].max())
def most_active_user(df, title):
    """Draw a bar chart of the number of messages per user.

    Args:
        df: Frame with the columns ``user`` and ``Number_of_messages``.
        title: Text placed above the chart.

    Note that this changes matplotlib's global ``rcParams`` (font size and
    figure size) and the seaborn style.
    """
    # Global plot settings; set before the figure is created so that the
    # new figure picks up the size.
    matplotlib.rcParams.update({'font.size': 20, 'figure.figsize': (20, 8)})
    sns.set_style("darkgrid")

    plt.figure()
    plt.title(title)
    sns.barplot(data=df, x='user', y='Number_of_messages', dodge=False)
# Question 2: messages per user. value_counts() tallies the 'user' column and
# the result is reshaped into a two-column frame (user, Number_of_messages).
most_chat = df['user'].value_counts().rename_axis('user').reset_index(name='Number_of_messages')
most_chat
most_active_user(most_chat, "User with most messages")
def get_emojis_list(df):
    """Count how often each emoji occurs in the ``message`` column of *df*.

    Args:
        df: DataFrame with a ``message`` column of strings.

    Returns:
        A ``collections.Counter`` mapping each emoji found to its number of
        occurrences over all messages.
    """
    # ``UNICODE_EMOJI_ENGLISH`` was removed in emoji 2.0; newer releases
    # expose the emoji strings as the keys of ``EMOJI_DATA`` instead.
    emoji_table = getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None)
    if emoji_table is None:
        emoji_table = emoji.EMOJI_DATA

    # Longest alternatives first: regex alternation takes the first branch
    # that matches, so a multi-codepoint emoji (skin tone, joined sequence)
    # has to be tried before the shorter emoji it starts with.
    emoji_list = sorted(emoji_table, key=len, reverse=True)
    r = re.compile('|'.join(re.escape(p) for p in emoji_list))

    emoji_counter = Counter()
    for text in df['message']:
        emoji_counter.update(r.findall(text))
    return emoji_counter
def most_common_emojis(df, number):
    """Return the *number* most frequent emojis in *df* as a DataFrame.

    The result has one row per emoji with the columns ``emoji`` and
    ``count``, in the order given by ``Counter.most_common``.
    """
    top_pairs = get_emojis_list(df).most_common(number)
    return pd.DataFrame(top_pairs, columns=['emoji', 'count'])
# Question 3: separate copy of the messages for the emoji analysis.
emoji_df = df.copy()
# The 10 most frequent emojis over the whole chat.
most_emojis = most_common_emojis(emoji_df, 10)
most_emojis
def emojis_pie_chart(most_emojis, title):
    """Show a pie chart of emoji usage.

    Args:
        most_emojis: Frame with the columns ``emoji`` (slice label) and
            ``count`` (slice size).
        title: Title of the chart.
    """
    pie = px.pie(data_frame=most_emojis, names='emoji', values='count', title=title)
    # Write the percentage and the label inside each slice.
    pie.update_traces(textinfo='percent+label', textposition='inside')
    pie.show()
emojis_pie_chart(most_emojis, 'Emoji percentage used in chat group')

# Question 4: one pie chart of the 10 most used emojis for every user
users = emoji_df['user'].unique()
for user in users:
    subdf = emoji_df[emoji_df['user'].eq(user)]
    most_emojis = most_common_emojis(subdf, 10)
    title = f"Emoji percentage used in chat group by {user}"
    emojis_pie_chart(most_emojis, title)
# Download the NLTK stop-word corpus so that the word lists can be loaded.
nltk.download('stopwords')
# Portuguese stop words; most_words() passes this list to WordCloud.
stopwords = nltk.corpus.stopwords.words('portuguese')
def most_words(df, title):
    """Show a word cloud built from every message in *df*.

    Args:
        df: Frame with a ``message`` column of strings.
        title: Title placed above the image.

    Uses the module-level ``stopwords`` list as the words to leave out.
    """
    # One long text: all messages separated by a space.
    corpus = " ".join(df.message)

    cloud = WordCloud(
        width=800,
        height=800,
        stopwords=stopwords,
        background_color="black",
        min_font_size=10,
    )
    cloud.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis("off")
    plt.show()
# Question 5: word cloud over the whole conversation.
word_df = df.copy()
most_words(word_df, "Most word writted in chat group")

# Question 6: word cloud for each user
users = word_df['user'].unique()
for user in users:
    subdf = word_df.loc[word_df['user'] == user]
    title = "Most word writted in chat group by " + str(user)
    try:
        # Pass the user's own messages (subdf), not the full frame;
        # otherwise every chart repeats the group-wide cloud.
        most_words(subdf, title)
    except ValueError as err:
        # WordCloud raises ValueError when no word is left to draw (for
        # example a user who only sent emojis); skip that user instead of
        # aborting the remaining charts.
        print("No word cloud for", user, "-", err)
def most_active_hours(df, title):
    """Draw a bar chart of the number of messages per hour of the day.

    Args:
        df: Frame with the columns ``Hour`` and ``Number_of_messages``.
        title: Text placed above the chart.

    Note that this changes matplotlib's global ``rcParams`` (font size and
    figure size) and the seaborn style.
    """
    # Global plot settings; set before the figure is created so that the
    # new figure picks up the size.
    matplotlib.rcParams.update({'font.size': 20, 'figure.figsize': (20, 8)})
    sns.set_style("darkgrid")

    plt.figure()
    plt.title(title)
    sns.barplot(data=df, x='Hour', y='Number_of_messages', dodge=False)
# Question 7: messages per hour of the day for the whole group.
most_active = df.copy()
# Hour of each message, taken with the vectorised datetime accessor
# instead of a Python-level lambda applied row by row.
most_active['Hour'] = most_active['timestamp'].dt.hour
active = most_active['Hour'].value_counts().rename_axis('Hour').reset_index(name='Number_of_messages')
active
most_active_hours(active, "Most hour activate chat group")

# Question 8: the same chart for each user.
users = most_active['user'].unique()
for user in users:
    subdf = most_active.loc[most_active['user'] == user]
    active = subdf['Hour'].value_counts().rename_axis('Hour').reset_index(name='Number_of_messages')
    title = "Most active hour by " + str(user)
    most_active_hours(active, title)
def most_active_day(df, title):
    """Draw a bar chart of the number of messages per day of the week.

    Args:
        df: Frame with the columns ``Day`` and ``Number_of_messages``.
        title: Text placed above the chart.

    Note that this changes matplotlib's global ``rcParams`` (font size and
    figure size) and the seaborn style.
    """
    # Global plot settings; set before the figure is created so that the
    # new figure picks up the size.
    matplotlib.rcParams.update({'font.size': 20, 'figure.figsize': (20, 8)})
    sns.set_style("darkgrid")

    plt.figure()
    plt.title(title)
    sns.barplot(data=df, x='Day', y='Number_of_messages', dodge=False)
# Question 9: messages per day of the week for the whole group.
most_day = df.copy()
# Derive the day name from this frame's own timestamps. Reading them from a
# different frame only works while both share exactly the same index.
most_day['Day'] = most_day['timestamp'].dt.day_name()
active = most_day['Day'].value_counts().rename_axis('Day').reset_index(name='Number_of_messages')
active
most_active_day(active, "Most active day chat group")

# Question 10: the same chart for each user.
users = most_day['user'].unique()
for user in users:
    subdf = most_day.loc[most_day['user'] == user]
    active = subdf['Day'].value_counts().rename_axis('Day').reset_index(name='Number_of_messages')
    title = "Most active day by " + str(user)
    most_active_day(active, title)
def most_active_month(df, title):
    """Draw a bar chart of the number of messages per month.

    Args:
        df: Frame with the columns ``Month`` and ``Number_of_messages``.
        title: Text placed above the chart.

    Note that this changes matplotlib's global ``rcParams`` (font size and
    figure size) and the seaborn style.
    """
    # Global plot settings; set before the figure is created so that the
    # new figure picks up the size.
    matplotlib.rcParams.update({'font.size': 20, 'figure.figsize': (20, 8)})
    sns.set_style("darkgrid")

    plt.figure()
    plt.title(title)
    sns.barplot(data=df, x='Month', y='Number_of_messages', dodge=False)
# Question 11: messages per month for the whole group.
most_month = df.copy()
# Derive the month name from this frame's own timestamps. Reading them from
# a different frame only works while both share exactly the same index.
most_month['Month'] = most_month['timestamp'].dt.month_name()
active = most_month['Month'].value_counts().rename_axis('Month').reset_index(name='Number_of_messages')
active
most_active_month(active, "Most active month chat group")

# Question 12: the same chart for each user.
users = most_month['user'].unique()
for user in users:
    # The mask is built from most_month itself, so it always lines up with
    # the rows being selected.
    subdf = most_month.loc[most_month['user'] == user]
    active = subdf['Month'].value_counts().rename_axis('Month').reset_index(name='Number_of_messages')
    title = "Most active month by " + str(user)
    most_active_month(active, title)